# Load the cleaned social-media usage survey: one row per respondent, columns
# = weekly hours per platform plus a 1-5 "How you felt the entire week?" rating.
library(readxl)
# NOTE(review): absolute local path -- this will only run on this machine.
sm <- read_excel("/Users/user/Downloads/social_media_cleaned.xlsx")
# Inspect the structure (21 x 10 tibble; see transcript below).
str(sm)
## tibble [21 × 10] (S3: tbl_df/tbl/data.frame)
## $ character : chr [1:21] "masinl" "peace" "Patty" "Bunny" ...
## $ Instagram : num [1:21] 3.5 7.73 3.77 5.38 12 ...
## $ LinkedIn : num [1:21] 4 5.2 7 5.317 0.583 ...
## $ SnapChat : num [1:21] 1 3.683 0.533 1.3 0 ...
## $ Twitter : num [1:21] 5 0 0 0 0.667 ...
## $ Whatsapp/Wechat : num [1:21] 1 4.18 9.83 5.3 3 ...
## $ youtube : num [1:21] 2.5 4.25 1.85 2 3.5 7 3 2 4 3 ...
## $ OTT : num [1:21] 14.5 0 2 2 2 3 0 3 3 0 ...
## $ Reddit : num [1:21] 2.5 0 0 0 1 0 0 0 0 0 ...
## $ How you felt the entire week?: num [1:21] 3 3 4 4 3 5 4 3 3 2 ...
# Dependent variable: `How you felt the entire week?`
# Independent variables: Instagram, LinkedIn, SnapChat, Twitter,
# Whatsapp/Wechat, youtube, OTT, Reddit.
# Keep only the eight usage predictors by dropping the respondent-name column
# and the response column (equivalent to selecting columns 2:9).
sm1 <- sm[, setdiff(names(sm), c("character", "How you felt the entire week?"))]
sm1
## # A tibble: 21 × 8
## Instagram LinkedIn SnapChat Twitter `Whatsapp/Wechat` youtube OTT Reddit
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.5 4 1 5 1 2.5 14.5 2.5
## 2 7.73 5.2 3.68 0 4.18 4.25 0 0
## 3 3.77 7 0.533 0 9.83 1.85 2 0
## 4 5.38 5.32 1.3 0 5.3 2 2 0
## 5 12 0.583 0 0.667 3 3.5 2 1
## 6 2.33 7 0.467 0 12 7 3 0
## 7 5.37 4 0 0 6 3 0 0
## 8 7 4 3 0 10 2 3 0
## 9 8.65 10 3.83 0 6.15 4 3 0
## 10 0.167 0 0 0 1 3 0 0
## # ℹ 11 more rows
#load necessary libraries
library(magrittr)
library(NbClust)
library(cluster)
library(factoextra)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#Hierarchical Clustering- Dendrogram
# Standardize the predictors so every platform contributes equally to the
# Euclidean distances used by hclust().
sm_scaled <- scale(sm1)
dist_matrix <- dist(sm_scaled)
#Clustering Single
# Single-linkage dendrogram (nearest-neighbour merging; prone to chaining).
hc <- hclust(dist_matrix,method = "single")
fviz_dend(hc)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#Default Clustering
# hclust() defaults to complete linkage.
hc <- hclust(dist_matrix)
plot(hc, hang = -1, cex = 0.6, main = "Dendrogram for Hierarchical Clustering")
#Average Clustering
# Average linkage. Note `hc` is reassigned here, so cutree() below operates
# on the average-linkage tree.
hc <- hclust(dist_matrix,method = "average")
plot(hc, hang = -1, cex = 0.6, main = "Dendrogram for Hierarchical Clustering")
# From the dendrograms above, k = 2 clusters looks sufficient; this is
# confirmed further by the D-index plot from NbClust below.
num_clusters <- 2
clusters <- cutree(hc, k = num_clusters)
# Membership for each cluster
table(clusters)
## clusters
## 1 2
## 1 20
# Principal Components
# PCA on the correlation matrix (scale=TRUE standardizes each column first).
pca_result <- prcomp(sm1,scale=TRUE)
pca_result
## Standard deviations (1, .., p=8):
## [1] 1.4936735 1.3063036 1.1303414 0.9384308 0.9007674 0.7694819 0.6078953
## [8] 0.3621675
##
## Rotation (n x k) = (8 x 8):
## PC1 PC2 PC3 PC4 PC5
## Instagram 0.43537868 -0.15356106 0.377944297 0.03258545 -0.36643468
## LinkedIn 0.35649468 -0.20991651 -0.329259391 -0.19907722 0.70255046
## SnapChat 0.15947567 0.03392372 0.717791182 0.10281712 0.51019404
## Twitter -0.39046510 -0.53815939 0.040482193 -0.33406772 -0.09484716
## Whatsapp/Wechat 0.52694693 0.06079220 0.007167203 0.09132133 -0.28308505
## youtube 0.45370070 -0.20727847 -0.406762139 -0.03615301 -0.09453089
## OTT 0.08162766 -0.68423619 0.216480569 -0.12793328 -0.07632718
## Reddit -0.12361633 -0.35601755 -0.139687974 0.90062217 0.08919336
## PC6 PC7 PC8
## Instagram -0.51184307 -0.48893799 -0.08742222
## LinkedIn 0.02630366 -0.42220705 -0.09096859
## SnapChat -0.04887482 0.39304838 -0.17449187
## Twitter -0.02585534 0.04614556 -0.65794190
## Whatsapp/Wechat 0.68926770 0.02263817 -0.39306576
## youtube -0.39774620 0.64519354 -0.03187491
## OTT 0.31710521 0.06567287 0.59264873
## Reddit -0.02048963 -0.07045776 -0.11831273
#Non-Hierarchical Clustering(k-means)
num_clusters <- 2
# Seed the RNG so the k-means assignment (which uses random starts) is
# reproducible, and use several starts to avoid a poor local optimum.
# NOTE(review): this clusters the raw hours while the silhouette and
# visualizations below use distances on sm_scaled -- consider
# kmeans(sm_scaled, ...) for consistency; confirm which was intended.
set.seed(123)
kmeans_model <- kmeans(sm1, centers = num_clusters, nstart = 25)
# Membership for each cluster
table(kmeans_model$cluster)
##
## 1 2
## 2 19
# Visualize cluster and membership using first two Principal Components
fviz_cluster(list(data = pca_result$x[, 1:2], cluster = kmeans_model$cluster))
# Visualize cluster centers for k-means
# (frame/frame.type are deprecated factoextra arguments -- see the warnings
# below; ellipse/ellipse.type are the modern equivalents.)
fviz_cluster(kmeans_model, data = sm_scaled, geom = "point", frame.type = "convex",
pointsize = 2, fill = "white", main = "K-means Cluster Centers")
## Warning: argument frame is deprecated; please use ellipse instead.
## Warning: argument frame.type is deprecated; please use ellipse.type instead.
# Visualize cluster and membership using first two Principal Components for k-means
# NOTE(review): sm_scaled is already standardized, so scale = TRUE here
# re-scales an already-scaled matrix (redundant, though harmless).
pca_result <- prcomp(sm_scaled, scale = TRUE)
fviz_cluster(kmeans_model, data = pca_result$x[, 1:2], geom = "point",
pointsize = 2, fill = "white", main = "K-means Clustering Result (PCA)")
# Calculate silhouette information for k-means clustering
sil <- silhouette(kmeans_model$cluster, dist(sm_scaled))
# Visualize the silhouette plot for k-means clustering
fviz_silhouette(sil, main = "Silhouette Plot for K-means Clustering")
## cluster size ave.sil.width
## 1 1 2 -0.19
## 2 2 19 0.42
# Optimal-cluster search: run every NbClust index on the standardized data,
# for k = 2..10, using complete-linkage hierarchical clustering.
res.nbclust <- NbClust(
  scale(sm1),
  distance = "euclidean",
  min.nc = 2, max.nc = 10,
  method = "complete",
  index = "all"
)
## Warning in pf(beale, pp, df2): NaNs produced
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 2 as the best number of clusters
## * 3 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 2 proposed 5 as the best number of clusters
## * 2 proposed 6 as the best number of clusters
## * 4 proposed 8 as the best number of clusters
## * 4 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
#From cluster analysis I am able to recognize users whose social media usage patterns are similar to mine.
#Get the correlations between the variables
# Pairwise Pearson correlations; complete.obs drops rows with any NA first.
cor(sm1, use = "complete.obs")
## Instagram LinkedIn SnapChat Twitter Whatsapp/Wechat
## Instagram 1.00000000 0.097056399 0.28968877 -0.1930565 0.3776962
## LinkedIn 0.09705640 1.000000000 0.02552545 -0.1300685 0.2288356
## SnapChat 0.28968877 0.025525452 1.00000000 -0.1799569 0.0809998
## Twitter -0.19305653 -0.130068464 -0.17995686 1.0000000 -0.4958329
## Whatsapp/Wechat 0.37769615 0.228835623 0.08099980 -0.4958329 1.0000000
## youtube 0.33000187 0.452197669 -0.16006877 -0.1881777 0.3716852
## OTT 0.26738122 0.185492527 0.13158590 0.5570740 0.1336204
## Reddit -0.07461553 -0.006992884 -0.08116237 0.1649030 -0.1344497
## youtube OTT Reddit
## Instagram 0.33000187 0.2673812 -0.074615529
## LinkedIn 0.45219767 0.1854925 -0.006992884
## SnapChat -0.16006877 0.1315859 -0.081162369
## Twitter -0.18817769 0.5570740 0.164902964
## Whatsapp/Wechat 0.37168516 0.1336204 -0.134449660
## youtube 1.00000000 0.1605652 0.026399913
## OTT 0.16056523 1.0000000 0.232791099
## Reddit 0.02639991 0.2327911 1.000000000
#Computing Principal Components
# Same PCA as pca_result above, recomputed under a dedicated name for the
# PCA-focused section.
social_pca <- prcomp(sm1,scale=TRUE)
social_pca
## Standard deviations (1, .., p=8):
## [1] 1.4936735 1.3063036 1.1303414 0.9384308 0.9007674 0.7694819 0.6078953
## [8] 0.3621675
##
## Rotation (n x k) = (8 x 8):
## PC1 PC2 PC3 PC4 PC5
## Instagram 0.43537868 -0.15356106 0.377944297 0.03258545 -0.36643468
## LinkedIn 0.35649468 -0.20991651 -0.329259391 -0.19907722 0.70255046
## SnapChat 0.15947567 0.03392372 0.717791182 0.10281712 0.51019404
## Twitter -0.39046510 -0.53815939 0.040482193 -0.33406772 -0.09484716
## Whatsapp/Wechat 0.52694693 0.06079220 0.007167203 0.09132133 -0.28308505
## youtube 0.45370070 -0.20727847 -0.406762139 -0.03615301 -0.09453089
## OTT 0.08162766 -0.68423619 0.216480569 -0.12793328 -0.07632718
## Reddit -0.12361633 -0.35601755 -0.139687974 0.90062217 0.08919336
## PC6 PC7 PC8
## Instagram -0.51184307 -0.48893799 -0.08742222
## LinkedIn 0.02630366 -0.42220705 -0.09096859
## SnapChat -0.04887482 0.39304838 -0.17449187
## Twitter -0.02585534 0.04614556 -0.65794190
## Whatsapp/Wechat 0.68926770 0.02263817 -0.39306576
## youtube -0.39774620 0.64519354 -0.03187491
## OTT 0.31710521 0.06567287 0.59264873
## Reddit -0.02048963 -0.07045776 -0.11831273
summary(social_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.4937 1.3063 1.1303 0.9384 0.9008 0.76948 0.60790
## Proportion of Variance 0.2789 0.2133 0.1597 0.1101 0.1014 0.07401 0.04619
## Cumulative Proportion 0.2789 0.4922 0.6519 0.7620 0.8634 0.93741 0.98360
## PC8
## Standard deviation 0.3622
## Proportion of Variance 0.0164
## Cumulative Proportion 1.0000
# Eigenvalues = squared singular values (variance carried by each component).
eigen_social<- social_pca$sdev^2
eigen_social
## [1] 2.2310606 1.7064291 1.2776717 0.8806523 0.8113819 0.5921023 0.3695367
## [8] 0.1311653
# From the PCA variance breakdown it is evident that PC1 and PC2 together
# account for around 50% of the total variance.
#Screeplot
plot(eigen_social, xlab = "Component number", ylab = "Component variance", type = "l", main = "Scree diagram")
# Log-scale scree plot: makes the elbow easier to spot among small eigenvalues.
plot(log(eigen_social), xlab = "Component number", ylab = "Component variance", type = "l", main = "Scree diagram")
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.3.2
# Scree plot of the FactoMineR PCA with percentage-of-variance labels.
res.pca <- PCA(sm_scaled, graph = FALSE)
fviz_eig(res.pca, addlabels = TRUE)
# From the scree-plot elbow it is beneficial to keep PC1..PC6, which together
# cover ~93% of the total variance.
# Visualization using PCs. (The duplicate library(FactoMineR) and
# library("factoextra") calls were removed -- both packages are already
# loaded above.)
# PCA() standardizes internally by default (scale.unit = TRUE), so running it
# on the raw sm1 is equivalent to the sm_scaled call above.
res.pca <- PCA(sm1, graph = FALSE)
fviz_pca_var(res.pca, col.var = "black")
#From the above I can tell that most of my classmates usage timings of the apps Instagram,Whatsapp/Wechat, LinkedIn, Youtube and snapchat are similar. Most probably, twitter and reddit are not used by me and my classmates.
# load library for factor analysis
library(ggplot2)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
#Decide how many Factors are ideal for your dataset?
# Horn's parallel analysis: compares observed eigenvalues against those from
# random data of the same dimensions.
fa.parallel(sm1)
## Parallel analysis suggests that the number of factors = 0 and the number of components = 0
#Parallel analysis suggests that the number of factors = 0 and the number of components = 0
#Explain the output for your factor model?
# Principal-components "factor" solution: 2 components with varimax
# (orthogonal) rotation.
fit.pc <- principal(sm1, nfactors=2, rotate="varimax")
fit.pc
## Principal Components Analysis
## Call: principal(r = sm1, nfactors = 2, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC2 h2 u2 com
## Instagram 0.68 0.01 0.463 0.54 1.0
## LinkedIn 0.59 0.11 0.359 0.64 1.1
## SnapChat 0.22 -0.11 0.059 0.94 1.5
## Twitter -0.36 0.84 0.834 0.17 1.4
## Whatsapp/Wechat 0.73 -0.30 0.626 0.37 1.3
## youtube 0.73 0.07 0.533 0.47 1.0
## OTT 0.37 0.82 0.814 0.19 1.4
## Reddit -0.04 0.50 0.250 0.75 1.0
##
## RC1 RC2
## SS loadings 2.19 1.75
## Proportion Var 0.27 0.22
## Cumulative Var 0.27 0.49
## Proportion Explained 0.56 0.44
## Cumulative Proportion 0.56 1.00
##
## Mean item complexity = 1.2
## Test of the hypothesis that 2 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.13
## with the empirical chi square 21.06 with prob < 0.072
##
## Fit based upon off diagonal values = 0.71
#High absolute values (close to 1) indicate a strong relationship between the variable and the factor. #h2 explains how much variance of the variables are explained by the factors. #u2 indicates the amount of variance not explained by the factors Principal Components Analysis Call: principal(r = sm1, nfactors = 2, rotate = “varimax”) Standardized loadings (pattern matrix) based upon correlation matrix
# RC1 RC2
# SS loadings 2.27 1.80 Proportion Var 0.25 0.20 Cumulative Var 0.25 0.45 Proportion Explained 0.56 0.44 Cumulative Proportion 0.56 1.00
# Mean item complexity = 1.3 Test of the hypothesis that 2 components are sufficient.
# The root mean square of the residuals (RMSR) is 0.14 with the empirical chi square 29.01 with prob < 0.066
# Eigenvalues of the correlation matrix (same values as eigen_social above).
round(fit.pc$values, 3)
## [1] 2.231 1.706 1.278 0.881 0.811 0.592 0.370 0.131
# Rotated loadings; the print method suppresses entries with small absolute
# loadings, which is why some cells below are blank.
fit.pc$loadings
##
## Loadings:
## RC1 RC2
## Instagram 0.681
## LinkedIn 0.589 0.111
## SnapChat 0.216 -0.110
## Twitter -0.359 0.840
## Whatsapp/Wechat 0.732 -0.300
## youtube 0.727
## OTT 0.371 0.822
## Reddit 0.498
##
## RC1 RC2
## SS loadings 2.189 1.749
## Proportion Var 0.274 0.219
## Cumulative Var 0.274 0.492
# Communalities
# h2: the share of each variable's variance explained by the two components.
fit.pc$communality
## Instagram LinkedIn SnapChat Twitter Whatsapp/Wechat
## 0.46314710 0.35873574 0.05870521 0.83436255 0.62581187
## youtube OTT Reddit
## 0.53256680 0.81378028 0.25038016
# Rotated factor scores, Notice the columns ordering: RC1, RC2
# (Printing fit.pc again shows the full rotated solution.)
fit.pc
## Principal Components Analysis
## Call: principal(r = sm1, nfactors = 2, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC2 h2 u2 com
## Instagram 0.68 0.01 0.463 0.54 1.0
## LinkedIn 0.59 0.11 0.359 0.64 1.1
## SnapChat 0.22 -0.11 0.059 0.94 1.5
## Twitter -0.36 0.84 0.834 0.17 1.4
## Whatsapp/Wechat 0.73 -0.30 0.626 0.37 1.3
## youtube 0.73 0.07 0.533 0.47 1.0
## OTT 0.37 0.82 0.814 0.19 1.4
## Reddit -0.04 0.50 0.250 0.75 1.0
##
## RC1 RC2
## SS loadings 2.19 1.75
## Proportion Var 0.27 0.22
## Cumulative Var 0.27 0.49
## Proportion Explained 0.56 0.44
## Cumulative Proportion 0.56 1.00
##
## Mean item complexity = 1.2
## Test of the hypothesis that 2 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.13
## with the empirical chi square 21.06 with prob < 0.072
##
## Fit based upon off diagonal values = 0.71
fit.pc$scores
## RC1 RC2
## [1,] -0.49593354 3.81935345
## [2,] 0.37453109 -0.45810128
## [3,] 0.24035060 -0.39724117
## [4,] -0.11821501 -0.28545411
## [5,] -0.05535776 0.18730345
## [6,] 1.33982873 -0.15394397
## [7,] -0.16544326 -0.57370308
## [8,] 0.36239954 -0.37616101
## [9,] 1.27403174 0.04737682
## [10,] -1.47503111 -0.57670717
## [11,] -1.47090422 0.33142379
## [12,] -0.99564293 0.65382335
## [13,] -0.64587802 -0.84729946
## [14,] -0.54403365 -0.04349169
## [15,] -0.05025441 0.89199339
## [16,] 0.46518890 -0.19297427
## [17,] 0.63875357 -0.56313735
## [18,] -0.42630523 -0.44123138
## [19,] 0.50350291 -0.59787800
## [20,] 2.63240375 0.51258979
## [21,] -1.38799166 -0.93654010
fa.plot(fit.pc) # See Correlations within Factors
# Show the columns that go into each factor.
fa.diagram(fit.pc) # Visualize the relationship
# Perform some visualizations using the factors.
#very simple structure visualization
# VSS / Velicer MAP / BIC criteria for choosing the number of factors
# (see the output transcript below).
vss(sm1)
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
##
## Very Simple Structure
## Call: vss(x = sm1)
## VSS complexity 1 achieves a maximimum of 0.65 with 5 factors
## VSS complexity 2 achieves a maximimum of 0.78 with 7 factors
##
## The Velicer MAP achieves a minimum of 0.07 with 1 factors
## BIC achieves a minimum of -35.54 with 1 factors
## Sample Size adjusted BIC achieves a minimum of 1.7 with 4 factors
##
## Statistics by number of factors
## vss1 vss2 map dof chisq prob sqresid fit RMSEA BIC SABIC complex
## 1 0.39 0.00 0.071 20 2.5e+01 0.19 7.0 0.39 0.1 -35.5 26.2 1.0
## 2 0.52 0.64 0.090 13 8.9e+00 0.78 4.2 0.64 0.0 -30.7 9.4 1.2
## 3 0.55 0.69 0.117 7 4.7e+00 0.70 2.9 0.75 0.0 -16.6 5.0 1.5
## 4 0.60 0.74 0.180 2 1.6e+00 0.45 2.1 0.81 0.0 -4.5 1.7 1.5
## 5 0.65 0.78 0.271 -2 6.1e-01 NA 1.4 0.87 NA NA NA 1.3
## 6 0.62 0.78 0.443 -5 3.9e-10 NA 1.2 0.90 NA NA NA 1.5
## 7 0.62 0.78 1.000 -7 0.0e+00 NA 1.2 0.90 NA NA NA 1.5
## 8 0.62 0.78 NA -8 0.0e+00 NA 1.2 0.90 NA NA NA 1.5
## eChisq SRMR eCRMS eBIC
## 1 3.2e+01 1.7e-01 0.196 -28.5
## 2 9.2e+00 8.8e-02 0.130 -30.4
## 3 2.1e+00 4.2e-02 0.084 -19.2
## 4 7.9e-01 2.6e-02 0.097 -5.3
## 5 2.2e-01 1.4e-02 NA NA
## 6 1.7e-10 3.8e-07 NA NA
## 7 2.2e-18 4.3e-11 NA NA
## 8 2.2e-18 4.3e-11 NA NA
# Very Simple Structure Call: vss(x = sm1). VSS complexity 1 achieves a maximum of 0.61 with 6 factors; VSS complexity 2 achieves a maximum of 0.78 with 7 factors.
# The Velicer MAP achieves a minimum of 0.06 with 1 factor; BIC achieves a minimum of -53.17 with 1 factor; sample-size-adjusted BIC achieves a minimum of 1.47 with 5 factors.
# Statistics by number of factors
# Computing Correlation Matrix
# Pearson correlations of the eight predictors (also computed earlier with
# use = "complete.obs").
corrm.social <- cor(sm1)
corrm.social
## Instagram LinkedIn SnapChat Twitter Whatsapp/Wechat
## Instagram 1.00000000 0.097056399 0.28968877 -0.1930565 0.3776962
## LinkedIn 0.09705640 1.000000000 0.02552545 -0.1300685 0.2288356
## SnapChat 0.28968877 0.025525452 1.00000000 -0.1799569 0.0809998
## Twitter -0.19305653 -0.130068464 -0.17995686 1.0000000 -0.4958329
## Whatsapp/Wechat 0.37769615 0.228835623 0.08099980 -0.4958329 1.0000000
## youtube 0.33000187 0.452197669 -0.16006877 -0.1881777 0.3716852
## OTT 0.26738122 0.185492527 0.13158590 0.5570740 0.1336204
## Reddit -0.07461553 -0.006992884 -0.08116237 0.1649030 -0.1344497
## youtube OTT Reddit
## Instagram 0.33000187 0.2673812 -0.074615529
## LinkedIn 0.45219767 0.1854925 -0.006992884
## SnapChat -0.16006877 0.1315859 -0.081162369
## Twitter -0.18817769 0.5570740 0.164902964
## Whatsapp/Wechat 0.37168516 0.1336204 -0.134449660
## youtube 1.00000000 0.1605652 0.026399913
## OTT 0.16056523 1.0000000 0.232791099
## Reddit 0.02639991 0.2327911 1.000000000
# NOTE(review): plot() on a plain matrix appears to just scatter the first
# two columns against each other -- confirm this is the intended picture; a
# dedicated correlation plot (e.g. corrplot) would be clearer.
plot(corrm.social)
social_pca <- prcomp(sm1, scale=TRUE)
summary(social_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.4937 1.3063 1.1303 0.9384 0.9008 0.76948 0.60790
## Proportion of Variance 0.2789 0.2133 0.1597 0.1101 0.1014 0.07401 0.04619
## Cumulative Proportion 0.2789 0.4922 0.6519 0.7620 0.8634 0.93741 0.98360
## PC8
## Standard deviation 0.3622
## Proportion of Variance 0.0164
## Cumulative Proportion 1.0000
# Default variance barplot for a prcomp object.
plot(social_pca)
#Biplot Visualization
# Biplot of the psych::principal solution: respondents and loadings in
# RC1/RC2 space.
biplot(fit.pc)
#I feel factor analysis is not beneficial for the social media data because I observed that we are missing the most part of the uniqueness of these apps by including factors and we are able to capture only a small portion of variances by using factors. #And parallel analysis screeplot indicated that the ideal number of factors for the social media data is zero. #From the component analysis we got similar results to PCA, where the apps like Instagram, whatsapp/wechat, LinkedIn, Youtube, Snapchat usages are a bit similar and high compared to OTT, Twitter and Reddit.
# Rebuild the modeling frame including the response (columns 2-10).
sm2 <- sm[, 2:10]
sm2
## # A tibble: 21 × 9
## Instagram LinkedIn SnapChat Twitter `Whatsapp/Wechat` youtube OTT Reddit
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.5 4 1 5 1 2.5 14.5 2.5
## 2 7.73 5.2 3.68 0 4.18 4.25 0 0
## 3 3.77 7 0.533 0 9.83 1.85 2 0
## 4 5.38 5.32 1.3 0 5.3 2 2 0
## 5 12 0.583 0 0.667 3 3.5 2 1
## 6 2.33 7 0.467 0 12 7 3 0
## 7 5.37 4 0 0 6 3 0 0
## 8 7 4 3 0 10 2 3 0
## 9 8.65 10 3.83 0 6.15 4 3 0
## 10 0.167 0 0 0 1 3 0 0
## # ℹ 11 more rows
## # ℹ 1 more variable: `How you felt the entire week?` <dbl>
# Performing multiple regression on the dataset
# Regress the weekly-feeling rating on all eight usage columns. Use bare
# column names with `data = sm2` (rather than sm2$... inside the formula):
# the sm2$-style formula hard-wires the training vectors into the terms, which
# silently breaks predict(fit, newdata = ...) later (see the "'newdata' had 1
# row but variables found have 21 rows" warning further down).
fit <- lm(`How you felt the entire week?` ~ ., data = sm2)
#show the results
summary(fit)
##
## Call:
## lm(formula = sm2$`How you felt the entire week?` ~ sm2$Instagram +
## sm2$LinkedIn + sm2$SnapChat + sm2$Twitter + sm2$`Whatsapp/Wechat` +
## sm2$youtube + sm2$OTT + sm2$Reddit, data = sm2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0693 -0.3702 -0.0812 0.4818 1.4647
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.68725 0.67260 3.995 0.00178 **
## sm2$Instagram -0.03358 0.06371 -0.527 0.60778
## sm2$LinkedIn 0.12267 0.08573 1.431 0.17799
## sm2$SnapChat 0.04470 0.06202 0.721 0.48488
## sm2$Twitter 0.18243 0.26988 0.676 0.51188
## sm2$`Whatsapp/Wechat` 0.04077 0.06708 0.608 0.55467
## sm2$youtube 0.06912 0.13459 0.514 0.61692
## sm2$OTT -0.08423 0.09193 -0.916 0.37757
## sm2$Reddit -0.02812 0.12209 -0.230 0.82174
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8115 on 12 degrees of freedom
## Multiple R-squared: 0.2784, Adjusted R-squared: -0.2026
## F-statistic: 0.5788 on 8 and 12 DF, p-value: 0.7774
#From the above summary we got p-value 0.7774 which indicates the coefficient of the predictor variable associated with this p-value is not statistically significant.
coefficients(fit)
## (Intercept) sm2$Instagram sm2$LinkedIn
## 2.68724656 -0.03357685 0.12266943
## sm2$SnapChat sm2$Twitter sm2$`Whatsapp/Wechat`
## 0.04469741 0.18242992 0.04076703
## sm2$youtube sm2$OTT sm2$Reddit
## 0.06911591 -0.08423071 -0.02811749
# The fitted model has the form y = b0 + b1*x1 + ... + bn*xn with intercept
# b0 = 2.687 and coefficients b1 = -0.034, ... (see the summary above).
# Fitted values for each of the 21 respondents.
fitted(fit)
## 1 2 3 4 5 6 7 8
## 2.939170 3.694387 3.803577 3.402626 2.645129 4.208769 3.449678 3.370189
## 9 10 11 12 13 14 15 16
## 4.069330 2.929765 3.364427 3.609704 3.518215 3.308767 3.177730 3.535305
## 17 18 19 20 21
## 3.151638 2.908330 3.696886 3.135180 3.081197
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairwise scatterplot matrix of the predictors and the response.
ggpairs(data=sm2, title="Social-Media")
plot(fit, which=1) # Residuals vs Fitted
plot(fit, which=2) # Normal Q-Q plot
residuals <- residuals(fit)
#Plot residuals against fitted values to check for homoscedasticity
plot_resid_fitted <- ggplot() +
geom_point(aes(x = fitted(fit), y = residuals)) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(x = "Fitted Values", y = "Residuals",
title = "Residuals vs Fitted Values Plot") +
theme_minimal()
print(plot_resid_fitted)
# The residual vs. fitted plot is a tool used to evaluate the assumptions
# and adequacy of a regression model. It helps to identify whether the
# model adequately captures the underlying relationships in the data or if
# there are issues that need to be addressed. The plot shows a pattern of
# points around zero, so the model is likely not appropriate.
# Predict the weekly feeling for one hypothetical usage profile. The newdata
# column names must match the model's predictor names exactly -- including
# `Whatsapp/Wechat` (check.names = FALSE keeps the slash in the name).
# Otherwise predict() silently falls back to the training data, which is what
# produced the "'newdata' had 1 row but variables found have 21 rows"
# warning below.
predict(fit, newdata = data.frame(
  Instagram = 8, LinkedIn = 5, SnapChat = 4, Twitter = 4,
  `Whatsapp/Wechat` = 4, youtube = 8, OTT = 3, Reddit = 4,
  check.names = FALSE
))
## Warning: 'newdata' had 1 row but variables found have 21 rows
## 1 2 3 4 5 6 7 8
## 2.939170 3.694387 3.803577 3.402626 2.645129 4.208769 3.449678 3.370189
## 9 10 11 12 13 14 15 16
## 4.069330 2.929765 3.364427 3.609704 3.518215 3.308767 3.177730 3.535305
## 17 18 19 20 21
## 3.151638 2.908330 3.696886 3.135180 3.081197
#Make predictions using the model
predicted <- predict(fit, newdata = sm2)
#Calculating RMSE by taking the square root of the mean of the squared differences between the actual values and the predicted values (predicted)
rmse <- sqrt(mean((sm2$`How you felt the entire week?` - predicted)^2))
rmse
## [1] 0.6134638
# NOTE(review): this is in-sample (training) RMSE -- it understates the error
# to expect on new data.
#Low RMSE(0.613) between 0 and 1 indicates that the models predictions are quite accurate, with small deviations from the actual values.
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
#Nonlinearity
# component + residual plot
crPlots(fit)
# plot studentized residuals vs. fitted values
library(car)
# spreadLevelPlot also suggests a variance-stabilizing power transformation
# (printed below).
spreadLevelPlot(fit)
##
## Suggested power transformation: -3.296001
#Load required Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.2
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
# NOTE: MASS is loaded after dplyr, so MASS::select masks dplyr::select from
# here on (see the mask message below) -- use dplyr::select explicitly if
# column selection is needed later.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Create the binary outcome: 1 if the weekly feeling is above the median,
# else 0. (The original comment about "high mpg" was a leftover from a
# different dataset.)
sm <- sm %>%
  mutate(new_prediction = ifelse(`How you felt the entire week?` >
                                   median(`How you felt the entire week?`), 1, 0))
# Performing logistic regression on the dataset
# Use bare column names with `data = sm` (not sm$... inside the formula) so
# the model is self-contained and predict(newdata = ...) works correctly.
logistic_simple <- glm(new_prediction ~ Instagram + LinkedIn + SnapChat +
                         Twitter + `Whatsapp/Wechat` + youtube + OTT + Reddit,
                       data = sm, family = binomial)
#Building the model by considering the entire week energy in binary format as new_prediction and building the model by considering the effect of instagram,LinkedIn, SnapChat, Twitter, Whatsapp, Youtube, OTT and reddit
summary(logistic_simple)
##
## Call:
## glm(formula = new_prediction ~ sm$Instagram + sm$LinkedIn + sm$SnapChat +
## sm$Twitter + sm$`Whatsapp/Wechat` + sm$youtube + sm$OTT +
## sm$Reddit, family = binomial, data = sm)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.23745 2.17500 -0.109 0.913
## sm$Instagram -0.43775 0.38043 -1.151 0.250
## sm$LinkedIn 0.73143 0.52598 1.391 0.164
## sm$SnapChat 0.19505 0.19313 1.010 0.313
## sm$Twitter 0.27309 0.77374 0.353 0.724
## sm$`Whatsapp/Wechat` -0.01691 0.26452 -0.064 0.949
## sm$youtube -0.12817 0.39594 -0.324 0.746
## sm$OTT -0.74612 0.84516 -0.883 0.377
## sm$Reddit -0.23678 0.65219 -0.363 0.717
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26.734 on 20 degrees of freedom
## Residual deviance: 18.723 on 12 degrees of freedom
## AIC: 36.723
##
## Number of Fisher Scoring iterations: 7
#From the above it is evident that the model is not acceptable since the independent variables are not contributing significantly to the variation in the dependent variable.
# Deviance residuals per observation.
residuals(logistic_simple)
## 1 2 3 4 5 6
## -0.015465560 -1.300023003 0.651494433 1.302715508 -0.040411531 0.930204652
## 7 8 9 10 11 12
## 1.240408254 -0.405977332 -1.703079144 -0.893619423 1.214683951 -0.961805036
## 13 14 15 16 17 18
## 0.814212616 -0.686344843 -0.334900417 1.689663089 -0.077603376 -0.242897674
## 19 20 21
## -0.911316594 -0.006144281 -1.115579423
plot(logistic_simple, which = 1)
# There is a fixed pattern in the residuals vs fitted plot, which means
# that the selected independent variables will not explain the dependent
# variable well.
# Make predictions on the same dataset
# type = "response" returns fitted probabilities rather than log-odds.
predicted_values <- predict(logistic_simple, type = "response")
predicted_values
## 1 2 3 4 5 6
## 1.195846e-04 5.704555e-01 8.087847e-01 4.280420e-01 8.162126e-04 6.487938e-01
## 7 8 9 10 11 12
## 4.633344e-01 7.910458e-02 7.654858e-01 3.291957e-01 4.781989e-01 3.703143e-01
## 13 14 15 16 17 18
## 7.178669e-01 2.098517e-01 5.453570e-02 2.399126e-01 3.006613e-03 2.906877e-02
## 19 20 21
## 3.398241e-01 1.887592e-05 4.632693e-01
# Convert predicted probabilities to binary predictions (0 or 1) based on a threshold 0.5
predicted_class <- ifelse(predicted_values > 0.5, 1, 0)
predicted_class
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
## 0 1 1 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0
#we can compute accuracy by comparing predicted values with the original data
original <- sm$new_prediction # Assuming new_prediction contains binary response variable (0 or 1)
# In-sample accuracy (no train/test split here, so this is optimistic).
accuracy <- mean(predicted_class == original)
print(accuracy)
## [1] 0.7142857
#Accuracy Provides an overall measure of model performance. An accuracy of 0.714 indicates that the logistic regression model correctly predicted the outcome (the value of new_prediction) approximately 71.4% correctly.
# Install pROC package (only need to run once)
#install.packages("pROC")
# Load the pROC package
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Sanity-check that response and predictions line up (21 each).
length(sm$new_prediction)
## [1] 21
length(predicted_class)
## [1] 21
# Compute ROC curve for the logistic regression model
# NOTE(review): built from the thresholded 0/1 predictions; using
# predicted_values (the probabilities) would give a smoother, more
# informative ROC.
roc_curve <- roc(original, predicted_class)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot ROC curve
plot(roc_curve, legacy.axes = TRUE, main = "ROC Curve for Logistic Regression Model")
# Fancier ROC with AUC annotation and a partial-AUC region (specificity
# 100%-90%); again computed from the hard 0/1 predictions.
roc(original,predicted_class,plot=TRUE, legacy.axes=TRUE, xlab="False Positive Percentage", ylab="True Postive Percentage", col="#377eb8", lwd=4, percent=TRUE, print.auc=TRUE, partial.auc=c(100, 90), auc.polygon = TRUE, auc.polygon.col = "#377eb822", print.auc.x=45)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = original, predictor = predicted_class, percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Postive Percentage", col = "#377eb8", lwd = 4, print.auc = TRUE, partial.auc = c(100, 90), auc.polygon = TRUE, auc.polygon.col = "#377eb822", print.auc.x = 45)
##
## Data: predicted_class in 14 controls (original 0) < 7 cases (original 1).
## Partial area under the curve (specificity 100%-90%): 1.5%
#ROC curves are typically used to evaluate the performance of binary classification models, where the predicted classes represent binary outcomes (e.g., positive/negative, yes/no). Here the dependent variable is not binary so the model is not appropriate.
# Rename columns by name rather than by position: positional renames break
# silently if the column order ever changes, and the original
# colnames(sm)[9] <- "Reddit" was a no-op (column 9 was already "Reddit").
sm <- dplyr::rename(sm,
                    wa = `Whatsapp/Wechat`,
                    weeksenergy = `How you felt the entire week?`)
str(sm)
## tibble [21 × 11] (S3: tbl_df/tbl/data.frame)
## $ character : chr [1:21] "masinl" "peace" "Patty" "Bunny" ...
## $ Instagram : num [1:21] 3.5 7.73 3.77 5.38 12 ...
## $ LinkedIn : num [1:21] 4 5.2 7 5.317 0.583 ...
## $ SnapChat : num [1:21] 1 3.683 0.533 1.3 0 ...
## $ Twitter : num [1:21] 5 0 0 0 0.667 ...
## $ wa : num [1:21] 1 4.18 9.83 5.3 3 ...
## $ youtube : num [1:21] 2.5 4.25 1.85 2 3.5 7 3 2 4 3 ...
## $ OTT : num [1:21] 14.5 0 2 2 2 3 0 3 3 0 ...
## $ Reddit : num [1:21] 2.5 0 0 0 1 0 0 0 0 0 ...
## $ weeksenergy : num [1:21] 3 3 4 4 3 5 4 3 3 2 ...
## $ new_prediction: num [1:21] 0 0 1 1 0 1 1 0 0 0 ...
# Splitting the dataset into 75% training and 25% test sets
# Seed the RNG so the split (and everything downstream of it) is reproducible.
set.seed(123)
smp_size_raw <- floor(0.75 * nrow(sm))
train_ind_raw <- sample(nrow(sm), size = smp_size_raw)
train_raw.df <- sm[train_ind_raw, ]
test_raw.df <- sm[-train_ind_raw, ]
# LDA on the training set. Use a plain formula with `data =` -- the original
# mixed train_raw.df$weeksenergy into the formula while also passing
# data = train_raw.df, which works here but is fragile for predict().
lda_model <- lda(weeksenergy ~ Instagram + LinkedIn + SnapChat + Twitter +
                   wa + youtube + OTT + Reddit, data = train_raw.df)
lda_model
## Call:
## lda(train_raw.df$weeksenergy ~ Instagram + LinkedIn + SnapChat +
## Twitter + wa + youtube + OTT + Reddit, data = train_raw.df)
##
## Prior probabilities of groups:
## 2 3 4
## 0.06666667 0.66666667 0.26666667
##
## Group means:
## Instagram LinkedIn SnapChat Twitter wa youtube OTT Reddit
## 2 0.1666667 0.000000 0.000000 0.0000000 1.000000 3.000000 0.000000 0.0000
## 3 6.9033333 3.558333 1.903333 0.9500000 6.101667 3.036667 3.446667 0.3500
## 4 4.7541667 3.663333 3.954167 0.6041667 5.213333 1.597500 0.627500 0.0025
##
## Coefficients of linear discriminants:
## LD1 LD2
## Instagram 0.5739838 -0.11149771
## LinkedIn 0.5953040 0.11895221
## SnapChat 0.2298415 0.12866943
## Twitter 2.5432617 -0.07554907
## wa 0.9426141 -0.06340697
## youtube -1.1123276 -0.29671447
## OTT -1.2808124 -0.07810649
## Reddit 4.4216728 -0.23047684
##
## Proportion of trace:
## LD1 LD2
## 0.9582 0.0418
#Prior probability shows the class distribution of each class in the training data.The data is classified into 4 groups(based on the peoples happiness measured on the scale of 5 (2,3,4,5 are the categories)) and LD1 having few significant coefficients.Among the groups 2,3,4,5 "3" is the most predominant group. #Coefficients of Linear Discriminants: The coefficients of linear discriminants represent the weights assigned to each predictor variable in the discriminant function.
# summary() on an lda object lists the sizes of its components, not fit
# statistics.
summary(lda_model)
## Length Class Mode
## prior 3 -none- numeric
## counts 3 -none- numeric
## means 24 -none- numeric
## scaling 16 -none- numeric
## lev 3 -none- character
## svd 2 -none- numeric
## N 1 -none- numeric
## call 3 -none- call
## terms 3 terms call
## xlevels 0 -none- list
#with significant coefficients and no obvious issues with class imbalance, the LDA model is likely acceptable. Here only LinkedIn variable plays a vital role. LDA model is moderately acceptable.
# Projection of the training points onto the linear discriminants.
plot(lda_model)
residuals(lda_model)
## NULL
#LDA does not inherently produce residuals
# Predicted class, posterior probabilities, and LD scores for the test set.
prediction <- predict(lda_model, test_raw.df)
prediction
## $class
## [1] 3 3 4 4 3 4
## Levels: 2 3 4
##
## $posterior
## 2 3 4
## 1 8.907638e-19 9.698888e-01 0.0301112144
## 2 1.735013e-12 9.993755e-01 0.0006244538
## 3 1.489082e-36 3.753828e-01 0.6246172136
## 4 4.638136e-171 3.489399e-14 1.0000000000
## 5 4.005425e-21 9.900696e-01 0.0099303579
## 6 8.371173e-51 1.341644e-01 0.8658355595
##
## $x
## LD1 LD2
## 1 -1.2973254 0.51539547
## 2 -2.7643515 -1.03803039
## 3 2.3870026 0.02113921
## 4 28.3048678 -1.59338036
## 5 -0.8890429 -0.72443861
## 6 5.2164541 -1.64239151
# Predict on the test set
predicted_classes <- predict(lda_model, test_raw.df)$class
# Create a confusion matrix to understand misclassifications
# (rows = predicted class, columns = actual weeksenergy).
confusion_matrix <- table(predicted_classes, test_raw.df$weeksenergy)
confusion_matrix
##
## predicted_classes 3 4 5
## 2 0 0 0
## 3 0 1 2
## 4 3 0 0
# Share of test rows classified correctly.
accuracy <- sum(predicted_classes == test_raw.df$weeksenergy) / nrow(test_raw.df)
accuracy
## [1] 0
# Accuracy is very low, suggesting LDA is not suitable to classify this dataset.
str(train_raw.df)
## tibble [15 × 11] (S3: tbl_df/tbl/data.frame)
## $ character : chr [1:15] "yh2020" "ak2001" "hahah" "Harvey" ...
## $ Instagram : num [1:15] 8.65 3.33 6 5.3 3.5 ...
## $ LinkedIn : num [1:15] 10 2.5 3 1.52 4 ...
## $ SnapChat : num [1:15] 3.833 0.333 1 14.867 1 ...
## $ Twitter : num [1:15] 0 2.83 1 0 5 ...
## $ wa : num [1:15] 6.15 3.67 3.42 4.02 1 ...
## $ youtube : num [1:15] 4 2.5 2.5 0.54 2.5 3.5 1.85 1 5.15 0.8 ...
## $ OTT : num [1:15] 3 1.5 1 0.51 14.5 ...
## $ Reddit : num [1:15] 0 0 0 0.01 2.5 1 0 0 0 0 ...
## $ weeksenergy : num [1:15] 3 3 3 4 3 3 4 4 3 3 ...
## $ new_prediction: num [1:15] 0 0 0 1 0 0 1 1 0 0 ...
library(klaR)
# Convert the response to a factor: partimat()/lda() need a grouping factor.
train_raw.df$weeksenergy <- factor(train_raw.df$weeksenergy)
# Partition plots for each predictor pair under LDA. The attach(train_raw.df)
# that was here has been removed: attach() is an anti-pattern, partimat()
# already receives the data via `data =`, and the attached copy would not
# have seen the factor conversion above anyway.
# NOTE(review): the `wa` (Whatsapp/Wechat) predictor is omitted from this
# formula -- confirm whether that is intentional.
partimat(weeksenergy ~ Instagram + LinkedIn + SnapChat + Twitter + youtube +
           OTT + Reddit, data = train_raw.df, method = "lda")